03 - Review cleaning

Author

Yann Say

Published

January 2, 2025

library(cleaningtools)
library(dplyr)

# Example inputs shipped with cleaningtools: the raw dataset plus the Kobo
# survey and choices sheets.
my_raw_dataset <- cleaningtools::cleaningtools_raw_data
my_kobo_survey <- cleaningtools::cleaningtools_survey
my_kobo_choice <- cleaningtools::cleaningtools_choices

# Filled cleaning log, read from the second sheet of the workbook.
my_filled_log <- readxl::read_excel(
  "../inputs/02 - example - cleaning-log-with-kobo - filled.xlsx",
  sheet = 2
)

# Apply the filled cleaning log to the raw dataset.
my_clean_data <- create_clean_data(
  raw_dataset = my_raw_dataset,
  raw_data_uuid_column = "X_uuid",
  cleaning_log = my_filled_log,
  cleaning_log_uuid_column = "uuid",
  cleaning_log_question_column = "question",
  cleaning_log_new_value_column = "new_value",
  cleaning_log_change_type_column = "change_type"
)

# Recreate the parent columns from the Kobo tool. The result is a list; its
# `data_with_fix_concat` and `cleaning_log` elements are used further down.
my_clean_data2 <- recreate_parent_column(
  dataset = my_clean_data,
  uuid_column = "X_uuid",
  kobo_survey = my_kobo_survey,
  kobo_choices = my_kobo_choice,
  sm_separator = ".",
  cleaning_log_to_append = my_filled_log
)

review_others

In the cleaning log, some open-text values are changed to blank. Some open-text questions are linked to a skip logic, e.g. "What is X?" followed by "Other, please specify". In some cases, the value of the linked question should be changed as well.

In the example below, the value for water_supply_other_neighbourhoods_why for the uuid 019bc718-c06a-46b8-bba8-c84f6c6efbd5 was changed to NA.

# Show the log entries that blank out the open-text follow-up question.
my_filled_log %>%
  filter(
    question == "water_supply_other_neighbourhoods_why" &
      change_type == "blank_response"
  )
uuid old_value question issue check_id check_binding change_type new_value enumerator_num
019bc718-c06a-46b8-bba8-c84f6c6efbd5 لا اعلم water_supply_other_neighbourhoods_why recode other NA water_supply_other_neighbourhoods_why / 019bc718-c06a-46b8-bba8-c84f6c6efbd5 blank_response NA 12

The Kobo survey shows a skip logic based on water_supply_other_neighbourhoods.

# Look up the skip logic (`relevant`) attached to the follow-up question.
my_kobo_survey %>%
  select(type, name, relevant) %>%
  filter(name == "water_supply_other_neighbourhoods_why")
type name relevant
text water_supply_other_neighbourhoods_why selected(${water_supply_other_neighbourhoods},'somewhat_worse') or selected(${water_supply_other_neighbourhoods},'much_worse')
# Inspect the parent question and its follow-up for the survey whose
# open-text answer was blanked.
my_clean_data %>%
  filter(X_uuid == "019bc718-c06a-46b8-bba8-c84f6c6efbd5") %>%
  select(
    water_supply_other_neighbourhoods,
    water_supply_other_neighbourhoods_why
  )
water_supply_other_neighbourhoods water_supply_other_neighbourhoods_why
somewhat_worse NA

Should the value of water_supply_other_neighbourhoods be changed? It depends on the question and the skip logic, but it is important to flag those cases so that a decision can be taken.

# Build the review log for "other" / open-text questions from the cleaned
# dataset and the Kobo survey. `consent_telephone_number` is excluded from
# the check.
review_other_log <- review_others(
  dataset = my_clean_data2$data_with_fix_concat,
  uuid_column = "X_uuid",
  kobo_survey = my_kobo_survey,
  columns_not_to_check = "consent_telephone_number"
)
Warning in create_logic_for_other(kobo_survey = kobo_survey,
compare_with_dataset = TRUE, : The following parent names: well_quality,
spring_quality, rainwater_quality, surface_quality, why_not_connected were not
found in the dataset. The function is ignoring them.

review_cleaning

# Deletion log: the entries that remove a whole survey.
my_deletion_log <- my_clean_data2$cleaning_log %>%
  filter(change_type == "remove_survey")

# Remaining log: everything that is not a survey removal and does not belong
# to a survey that was removed.
my_filled_log_no_deletion <- my_clean_data2$cleaning_log %>%
  filter(
    change_type != "remove_survey",
    !uuid %in% my_deletion_log$uuid
  )

# Review the cleaning using the raw dataset, the cleaned dataset, the cleaning
# log (without deletions) and the deletion log.
review_of_cleaning <- review_cleaning(
  raw_dataset = my_raw_dataset,
  raw_dataset_uuid_column = "X_uuid",
  clean_dataset = my_clean_data2$data_with_fix_concat,
  clean_dataset_uuid_column = "X_uuid",
  cleaning_log = my_filled_log_no_deletion,
  cleaning_log_uuid_column = "uuid",
  cleaning_log_question_column = "question",
  cleaning_log_new_value_column = "new_value",
  cleaning_log_change_type_column = "change_type",
  cleaning_log_old_value_column = "old_value",
  deletion_log = my_deletion_log,
  deletion_log_uuid_column = "uuid"
)

# Print the review table.
review_of_cleaning
uuid df.question df.change_type df.new_value cl.new_value df.old_value cl.old_value comment

Downloads